This document aims at exploring two datasets, one in 2016 on 6 individuals and another one in 2018 on 4 individuals. For that purpose, we need first to load the weanlingNES package to load data.
# Load the package that ships the weanling elephant seal datasets
library(weanlingNES)
# Attach the bundled dataset `data_nes` into the session
data("data_nes", package = "weanlingNES")
Let’s have a look at what’s inside data_nes$year_2016:
# List structure of the 2016 data: one data.table per individual.
# TRUE/FALSE spelled out (T/F are ordinary, reassignable variables).
str(data_nes$year_2016, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_3449:Classes 'data.table' and 'data.frame': 384 obs. of 23 variables:
## $ ind_3450:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3456:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3457:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3460:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3463:Classes 'data.table' and 'data.frame': 213 obs. of 23 variables:
A list of 6 data.frames, one for each seal.
For convenience, we aggregate all 6 individuals into one dataset.
# Combine all individuals into a single data.table; `idcol = TRUE` records
# the origin of each row in a `.id` column.
# NOTE: the argument is `use.names` (plural) — `use.name` only worked
# through partial argument matching.
data_2016 = rbindlist(data_nes$year_2016, use.names = TRUE, idcol = TRUE)
# display an interactive table of 100 randomly sampled rows
DT::datatable(data_2016[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# Per-individual summary of the raw 2016 data: recording length,
# deepest dive, and sea-surface temperature statistics.
summary_2016 = data_2016[
  ,
  .(
    nb_days_recorded = uniqueN(as.Date(date)),
    max_depth = max(maxpress_dbars),
    sst_mean = mean(sst2_c),
    sst_sd = sd(sst2_c)
  ),
  by = .id
]
sable(summary_2016,
      caption="Summary diving information relative to each 2016 individual",
      digits=2)
| .id | nb_days_recorded | max_depth | sst_mean | sst_sd |
|---|---|---|---|---|
| ind_3449 | 384 | 1118.81 | 26.54 | 129.91 |
| ind_3450 | 253 | 954.81 | 130.29 | 367.69 |
| ind_3456 | 253 | 697.63 | 125.24 | 360.39 |
| ind_3457 | 253 | 572.94 | 135.56 | 374.71 |
| ind_3460 | 253 | 832.25 | 65.24 | 249.12 |
| ind_3463 | 213 | 648.81 | 212.19 | 462.88 |
Well, it seems that sst is a bit odd. Let’s have a look at its distribution.
# Distribution of raw sst2, one histogram panel per seal
ggplot(data_2016) +
  aes(x = sst2_c, fill = .id) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.1: Distribution of raw sst2 for the six individuals in 2016
Let’s remove any data with a sst2_c > 500.
data_2016_filter = data_2016[sst2_c < 500, ]
# Same histogram as before, after removing the implausible sst2 values
ggplot(data_2016_filter) +
  aes(x = sst2_c, fill = .id) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.2: Distribution of filtered sst2 for the six individuals in 2016
Well, that seems to be much better! In the process of filtering out odd values, we removed 116 rows this way:
# Count how many rows the sst2 filter dropped, per individual
removed_2016 = data_2016[sst2_c > 500, .(nb_row_removed = .N), by = .id]
sable(removed_2016, caption = "# of rows removed by 2016-individuals")
| .id | nb_row_removed |
|---|---|
| ind_3449 | 4 |
| ind_3450 | 23 |
| ind_3456 | 22 |
| ind_3457 | 24 |
| ind_3460 | 10 |
| ind_3463 | 33 |
# Dive pressure over time per individual; the rows removed by the sst2
# filter are overlaid as black points to see when/where they occurred.
ggplot(data_2016, aes(x = as.Date(date), y = -maxpress_dbars, col = .id)) +
  geom_path(show.legend = FALSE) +
  geom_point(data = data_2016[sst2_c > 500, ], col = "black") +
  scale_x_date(date_labels = "%m/%Y") +
  labs(x = "Date", y = "Pressure (dbar)") +
  facet_wrap(.id ~ .) +
  theme_jjo() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Figure 1.3: Where and when the sst2 outliers occurred
Well this latter plot highlights several points:
ind_3449 seems to have returned ashore twice during his track. Let’s see if we can double-check that using some maps.
# Interactive map of ind_3449's raw track; the rows removed by the sst2
# filter are shown as red markers.
track_3449 = data_2016[.id == "ind_3449"]
outliers_3449 = data_2016[.id == "ind_3449" & sst2_c > 500]
leaflet() %>%
  setView(lng = -122, lat = 38, zoom = 2) %>%
  addTiles() %>%
  addPolylines(lat = track_3449$latitude_degs,
               lng = track_3449$longitude_degs,
               weight = 2) %>%
  addCircleMarkers(lat = outliers_3449$latitude_degs,
                   lng = outliers_3449$longitude_degs,
                   radius = 3,
                   stroke = FALSE,
                   color = "red",
                   fillOpacity = 1)
Figure 1.4: It is supposed to be the track of ind_3449… (red dots are the location of removed rows)
… these coordinates seem weird !
# Distribution summary of the raw coordinates, split by individual
coords_2016 = data_2016[, .(.id, longitude_degs, latitude_degs)]
coords_table = tbl_summary(coords_2016, by = .id)
modify_caption(coords_table, "Summary of `longitude_degree` and `latitude_degree`")
| Characteristic | ind_3449, N = 384¹ | ind_3450, N = 253¹ | ind_3456, N = 253¹ | ind_3457, N = 253¹ | ind_3460, N = 253¹ | ind_3463, N = 213¹ |
|---|---|---|---|---|---|---|
| longitude_degs | -119 (-132, -69) | -124 (-144, -99) | -112 (-134, -6) | -122 (-132, -97) | -122 (-144, -85) | -121 (-134, -93) |
| latitude_degs | 39 (-67, 68) | 60 (-63, 68) | 56 (-63, 68) | 44 (-63, 68) | 59 (-63, 68) | 63 (42, 72) |
¹ Median (IQR)
# Reshape coordinates to long format (one row per coordinate value),
# then plot their distributions per seal and per coordinate type.
coord_long = melt(
  data_2016[, .(Longitude = longitude_degs,
                Latitude = latitude_degs,
                .id)],
  id.vars = ".id",
  value.name = "Coordinate"
)
ggplot(coord_long, aes(x = Coordinate, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_grid(variable ~ .id) +
  theme_jjo()
Figure 1.5: Distribution of coordinates per seal
There is definitely something wrong with these coordinates (five of the seals would have crossed the equator…), but the representation of the track can also be improved! Here are some points to explore:
longitude: part of the data seems to have the wrong sign, resulting in distributions that appear to be cut off. latitude: this is less certain, but maybe the same problem occurs. Let’s try to play on the coordinates’ sign to see if we can display something that makes more sense.
# Force latitudes positive and longitudes negative (northern/western
# hemisphere) and re-draw ind_3449's track.
leaflet() %>%
  addTiles() %>%
  setView(lng = -122, lat = 50, zoom = 3) %>%
  addPolylines(
    lat = data_2016[.id == "ind_3449", abs(latitude_degs)],
    lng = data_2016[.id == "ind_3449", -abs(longitude_degs)],
    weight = 2
  )
Figure 1.6: An attempt to display the ind_3449’s track
I’ll better ask Roxanne!
Let’s have a look at what’s inside data_nes$year_2018:
# List structure of the 2018 data: one data.table per individual.
# TRUE/FALSE spelled out (T/F are ordinary, reassignable variables).
str(data_nes$year_2018, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_2018070:Classes 'data.table' and 'data.frame': 22393 obs. of 42 variables:
## $ ind_2018072:Classes 'data.table' and 'data.frame': 29921 obs. of 42 variables:
## $ ind_2018074:Classes 'data.table' and 'data.frame': 38608 obs. of 42 variables:
## $ ind_2018080:Classes 'data.table' and 'data.frame': 19028 obs. of 42 variables:
A list of 4 data.frames, one for each seal.
For convenience, we aggregate all 4 individuals into one dataset.
# Combine all individuals into a single data.table; `idcol = TRUE` records
# the origin of each row in a `.id` column.
# NOTE: the argument is `use.names` (plural) — `use.name` only worked
# through partial argument matching.
data_2018 = rbindlist(data_nes$year_2018, use.names = TRUE, idcol = TRUE)
# display an interactive table of 100 randomly sampled rows
DT::datatable(data_2018[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# Per-individual diving summary of the raw 2018 data: recording length,
# dive counts, and mean dive metrics (post-dive interval may contain NAs).
dive_summary_2018 = data_2018[
  ,
  .(
    nb_days_recorded = uniqueN(as.Date(date)),
    nb_dives = .N,
    maxdepth_mean = mean(maxdepth),
    dduration_mean = mean(dduration),
    botttime_mean = mean(botttime),
    pdi_mean = mean(pdi, na.rm = TRUE)
  ),
  by = .id
]
sable(dive_summary_2018,
      caption="Summary diving information relative to each 2018 individual",
      digits=2)
| .id | nb_days_recorded | nb_dives | maxdepth_mean | dduration_mean | botttime_mean | pdi_mean |
|---|---|---|---|---|---|---|
| ind_2018070 | 232 | 22393 | 305.52 | 783.27 | 243.22 | 109.55 |
| ind_2018072 | 342 | 29921 | 357.86 | 876.96 | 278.02 | 104.90 |
| ind_2018074 | 371 | 38608 | 250.67 | 686.25 | 291.89 | 302.77 |
| ind_2018080 | 215 | 19028 | 296.50 | 867.69 | 339.90 | 103.51 |
Very nice dataset :)
# Build a long dataset flagging missing values: one row per (cell), with a
# logical `value` telling whether that cell is NA. Bookkeeping/time columns
# are excluded. NOTE: the data.table argument is `.SDcols` — the original
# `.SDcol` only worked through partial argument matching.
dataPlot = melt(data_2018[, .(.id, is.na(.SD)), .SDcols = -c(".id",
                                                             "divenumber",
                                                             "year",
                                                             "month",
                                                             "day",
                                                             "hour",
                                                             "min",
                                                             "sec",
                                                             "juldate",
                                                             "divetype",
                                                             "date")])
# add the row index within each (attribute, individual) pair — this is the
# y axis of the tile plot (seq_len is the safe form of 1:.N)
dataPlot[, id_row := seq_len(.N), by = c("variable", ".id")]
# missingness map: black tiles mark missing cells, white tiles real values
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
  geom_tile() +
  labs(x = "Attributes", y = "Rows") +
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Real", "Missing")) +
  facet_wrap(.id ~ ., scales = "free_y") +
  theme_jjo() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key = element_rect(colour = "black")
  )
Figure 2.1: Check for missing values
So far so good, only a few variables seem to have missing values:
# Percentage of missing values per column, excluding bookkeeping columns.
# NOTE: `.SDcols` spelled out (`.SDcol` relied on partial matching), and
# `sum(is.na(x))` replaces the roundabout `length(x[is.na(x)])`.
table_inter = data_2018[, lapply(.SD, function(x) {
  round(sum(is.na(x)) * 100 / length(x), 1)
}), .SDcols = -c(
  ".id",
  "divenumber",
  "year",
  "month",
  "day",
  "hour",
  "min",
  "sec",
  "juldate",
  "divetype",
  "date"
)]
# flag columns with no missing values at all
# (vapply fixes the output type, unlike sapply)
cond_inter = vapply(table_inter, function(x) { x == 0 }, logical(1))
# drop the all-complete columns by reference, then display the rest
table_inter[, which(cond_inter) := NULL] %>%
  sable(caption="Percentage of missing values per columns having missing values!") %>%
  scroll_box(width = "100%")
| lightatsurf | lattenuation | euphoticdepth | thermoclinedepth | driftrate | benthicdivevertrate | cornerindex | foragingindex | verticalspeed90perc | verticalspeed95perc |
|---|---|---|---|---|---|---|---|---|---|
| 26.3 | 89 | 62.6 | 1.3 | 0.5 | 22.7 | 75.8 | 0.5 | 0.1 | 0.1 |
Ok, let’s have a look at all the data. But first, we have to remove outliers. Some of them are quite easy to spot looking at the distribution of dive duration:
# Distribution of dive duration BEFORE filtering; the vertical line marks
# the "subjective" 3000 s outlier threshold.
# ([, .SD] takes a copy so that := does not modify data_2018 itself.)
# NOTE: the axis labels were swapped in the original — x is the duration,
# y is the count of dives.
ggplot(data_2018[, .SD][, state := "Before"],
       aes(x = dduration, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  geom_vline(xintercept = 3000) +
  facet_grid(state ~ .id,
             scales = "free") +
  labs(x = "Dive duration (s)", y = "# of dives") +
  theme_jjo()
Figure 2.2: Distribution of dduration for each seal. The vertical line highlights the “subjective” threshold used to remove outliers
# Same distribution AFTER dropping dives longer than the 3000 s threshold.
# (The filter returns a copy, so := does not touch data_2018; a stray `[]`
# was removed.)
# NOTE: the axis labels were swapped in the original — x is the duration,
# y is the count of dives.
ggplot(data_2018[dduration < 3000, ][, state := "After"],
       aes(x = dduration, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  geom_vline(xintercept = 3000) +
  facet_grid(state ~ .id,
             scales = "free") +
  labs(x = "Dive duration (s)", y = "# of dives") +
  theme_jjo()
Figure 2.3: Same distribution of dduration for each seal but after removing any dduration ≥ 3000 s. The vertical line highlights the “subjective” threshold used to remove outliers
It seems much better, so let’s remove any rows with dduration ≥ 3000
# Keep only dives shorter than the 3000 s threshold
data_2018_filter = data_2018[dduration < 3000, ]
# How many rows did that remove, per individual?
removed_2018 = data_2018[dduration >= 3000, .(nb_row_removed = .N), by = .id]
sable(removed_2018, caption = "# of rows removed by 2018-individuals")
| .id | nb_row_removed |
|---|---|
| ind_2018070 | 3 |
| ind_2018072 | 1 |
| ind_2018074 | 33 |
# Attributes to plot: every column except identifiers, date parts, and the
# two depth references that are displayed separately.
excluded_cols = c(
  ".id",
  "date",
  "divenumber",
  "year",
  "month",
  "day",
  "hour",
  "min",
  "sec",
  "juldate",
  "divetype",
  "euphoticdepth",
  "thermoclinedepth",
  "day_departure"
)
names_display = setdiff(names(data_2018_filter), excluded_cols)
# For every attribute to display: emit a markdown tab header, then a
# per-individual time series of that attribute. `maxdepth` is special-cased:
# thermocline and euphotic depths are overlaid as reference layers, and an
# observation about ind_2018074 is printed below the plot.
for (i in names_display) {
# unnumbered tab header for the knitr output
cat('####', i, '{-} \n')
if (i == "maxdepth") {
print(
ggplot() +
# reference layer 1: thermocline depth, plotted downwards in red
geom_point(
data = data_2018_filter[, .(.id,
date,
thermoclinedepth)],
aes(
x = as.Date(date),
y = -thermoclinedepth,
colour = "Thermocline (m)"
),
alpha = .2,
size = .5
) +
# reference layer 2: euphotic depth, plotted downwards in black
geom_point(
data = data_2018_filter[, .(.id,
date,
euphoticdepth)],
aes(
x = as.Date(date),
y = -euphoticdepth,
colour = "Euphotic (m)"
),
alpha = .2,
size = .5
) +
# manual scale so the two reference layers get a shared "Zone" legend
scale_colour_manual(
values = c("Thermocline (m)" = 'red',
"Euphotic (m)" = "black"),
name = "Zone"
) +
# start a fresh colour scale for the per-individual dive depths
new_scale_color() +
geom_point(
data = melt(data_2018_filter[, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(
x = as.Date(date),
y = -value,
col = .id
),
alpha = 1 / 10,
size = .5,
show.legend = FALSE
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = "Maximum Depth (m)") +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
# markdown block-quote note rendered under the maxdepth tab
cat("> Considering `ind_2018074` have slightly different values than other individuals for the thermocline depth, it would be interesting to see where the animal went.")
} else {
# generic case: raw attribute values over time, one panel per individual
print(
ggplot(
data = melt(data_2018_filter[, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(
x = as.Date(date),
y = value,
col = .id
)
) +
geom_point(
show.legend = FALSE,
alpha = 1 / 10,
size = .5
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = i) +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
# blank line so knitr separates the tabs correctly
cat(' \n \n')
}
> Considering
ind_2018074 have slightly different values than other individuals for the thermocline depth, it would be interesting to see where the animal went.
# Same tabbed-plot loop as above, but restricted to the first month after
# departure (day_departure < 32) to zoom in on the start of the trip.
for (i in names_display) {
# unnumbered tab header for the knitr output
cat('####', i, '{-} \n')
if (i == "maxdepth") {
print(
ggplot() +
# reference layer 1: thermocline depth, plotted downwards in red
geom_point(
data = data_2018_filter[day_departure < 32, .(.id,
date,
thermoclinedepth)],
aes(
x = as.Date(date),
y = -thermoclinedepth,
colour = "Thermocline (m)"
),
alpha = .2,
size = .5
) +
# reference layer 2: euphotic depth, plotted downwards in black
geom_point(
data = data_2018_filter[day_departure < 32, .(.id,
date,
euphoticdepth)],
aes(
x = as.Date(date),
y = -euphoticdepth,
colour = "Euphotic (m)"
),
alpha = .2,
size = .5
) +
# manual scale so the two reference layers get a shared "Zone" legend
scale_colour_manual(
values = c("Thermocline (m)" = 'red',
"Euphotic (m)" = "black"),
name = "Zone"
) +
# start a fresh colour scale for the per-individual dive depths
new_scale_color() +
geom_point(
data = melt(data_2018_filter[day_departure < 32, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(
x = as.Date(date),
y = -value,
col = .id
),
alpha = 1 / 10,
size = .5,
show.legend = FALSE
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = "Maximum Depth (m)") +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
} else {
# generic case: raw attribute values over the first month, per individual
print(
ggplot(
data = melt(data_2018_filter[day_departure < 32, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(
x = as.Date(date),
y = value,
col = .id
)
) +
geom_point(
show.legend = FALSE,
alpha = 1 / 10,
size = .5
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = i) +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
# blank line so knitr separates the tabs correctly
cat(' \n \n')
}
Can we find nice correlations?
# Pairwise correlation matrix on the displayed numeric columns,
# rounded to one decimal. `with = FALSE` spelled out (F is reassignable).
corr_2018 = round(cor(data_2018_filter[, names_display, with = FALSE],
                      use = "pairwise.complete.obs"), 1)
# NA correlations (e.g. zero-variance pairs) treated as no correlation
corr_2018[is.na(corr_2018)] = 0
# p-values of the pairwise correlations
corr_p_2018 = cor_pmat(data_2018_filter[, names_display, with = FALSE])
# NA p-values treated as non-significant
corr_p_2018[is.na(corr_p_2018)] = 1
# display: crossed-out circles mark non-significant correlations
ggcorrplot(
  corr_2018,
  p.mat = corr_p_2018,
  hc.order = TRUE,
  method = "circle",
  type = "lower",
  ggtheme = theme_jjo(),
  sig.level = 0.05,
  colors = c("#00AFBB", "#E7B800", "#FC4E07")
)
Figure 2.4: Correlation matrix (crosses indicate non significant correlation)
Another way to see it:
# Flatten the correlation matrix into (row, column, cor, p) pairs
cor_result_2018 = flat_cor_mat(corr_2018, corr_p_2018)
# Keep only the strong correlations, sorted by decreasing strength.
# NOTE: the caption previously said 0.75 while the filter is >= 0.7.
cor_result_2018[cor >= .7, ][order(-abs(cor))] %>%
  sable(caption = "Pairwise correlation above 0.7")
| row | column | cor | p |
|---|---|---|---|
| verticalspeed90perc | verticalspeed95perc | 1.0 | 0 |
| maxdepth | asctime | 0.8 | 0 |
| botttime | efficiency | 0.8 | 0 |
| dwigglesbott | foragingindex | 0.8 | 0 |
| maxdepth | dduration | 0.7 | 0 |
| maxdepth | desctime | 0.7 | 0 |
| dduration | desctime | 0.7 | 0 |
| dduration | asctime | 0.7 | 0 |
| totvertdistbot | bottrange | 0.7 | 0 |
| totvertdistbot | verticalspeed90perc | 0.7 | 0 |
| totvertdistbot | verticalspeed95perc | 0.7 | 0 |
I guess nothing unexpected here, I’ll have to check with Patrick about the efficiency ;)
# Proportion of each dive type per individual and day since departure.
# Computed by reference (:=) in three chained steps:
#   sum_id      = total dives per (.id, day)
#   sum_id_days = dives per (.id, day, divetype)
#   prop        = share of each dive type on that day
data_2018_filter[, sum_id := .N, by = .(.id, day_departure)][, sum_id_days := .N, by = .(.id, day_departure, divetype)][, prop := sum_id_days /
sum_id]
# one row per (.id, divetype, day) combination
dataPlot = unique(data_2018_filter[, .(prop, .id, divetype, day_departure)])
# stacked area plot of dive-type proportions over the trip
ggplot(dataPlot, aes(
x = as.numeric(day_departure),
y = prop,
fill = as.character(divetype)
)) +
geom_area(alpha = 0.6 , size = 1) +
facet_wrap(.id ~ ., scales = "free") +
theme_jjo() +
theme(legend.position="bottom") +
labs(x="# of days since departure", y="Proportion of dives", fill = "Dive types")
Figure 2.5: Proportion dive types
# Dive duration against maximum depth, coloured by dive type;
# the legend keys are enlarged and made opaque for readability.
ggplot(data_2018_filter,
       aes(x = maxdepth, y = dduration, col = divetype)) +
  geom_point(size = .5, alpha = .1) +
  facet_wrap(.id ~ .) +
  guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))) +
  labs(x="Maximum depth (m)", y="Dive duration (s)")+
  theme_jjo() +
  theme(legend.position="bottom")
Figure 2.6: Dive duration vs. Maximum Depth colored by Dive Type
# Daily medians per individual, then drift rate against bottom time.
# NOTE: `guides(color = FALSE)` is deprecated in ggplot2; "none" is the
# supported equivalent. na.rm spelled TRUE (T is reassignable).
daily_2018 = data_2018_filter[
  ,
  .(driftrate = median(driftrate, na.rm = TRUE),
    botttime = median(botttime, na.rm = TRUE),
    maxdepth = median(maxdepth, na.rm = TRUE),
    dduration = median(dduration, na.rm = TRUE)),
  by = .(.id, day_departure)
]
ggplot(daily_2018, aes(x = botttime, y = driftrate, col = .id)) +
  geom_point(size = .5, alpha = .5) +
  geom_smooth(method = "lm") +
  guides(color = "none") +
  facet_wrap(.id ~ .) +
  scale_x_continuous(limits = c(0, 700)) +
  labs(x = "Daily median Bottom time (s)", y = "Daily median drift rate (m.s-1)") +
  theme_jjo()
Figure 2.7: Drift rate vs. Bottom time
# plot
ggplot(data_2018_filter[,.(driftrate=median(driftrate,na.rm=T),
botttime=median(botttime,na.rm=T),
maxdepth=median(maxdepth,na.rm=T),
dduration=median(dduration,na.rm=T)), by=.(.id,day_departure)],
aes(x=maxdepth, y=driftrate, col=.id))+
geom_point(size=.5,alpha=.5)+
geom_smooth(method="lm")+
guides(color=FALSE)+
facet_wrap(.id~.)+
labs(x = "Daily median Maximum depth (m)", y = "Daily median drift rate (m.s-1)")+
theme_jjo()
Figure 2.8: Drift rate vs. Maximum depth
# plot
ggplot(data_2018_filter[,.(driftrate=median(driftrate,na.rm=T),
botttime=median(botttime,na.rm=T),
maxdepth=median(maxdepth,na.rm=T),
dduration=median(dduration,na.rm=T)), by=.(.id,day_departure)],
aes(x=dduration, y=driftrate, col=.id))+
geom_point(size=.5,alpha=.5)+
geom_smooth(method="lm")+
guides(color=FALSE)+
facet_wrap(.id~.)+
labs(x = "Daily median Dive duration (s)", y = "Daily median drift rate (m.s-1)")+
theme_jjo()
Figure 2.9: Drift rate vs. Dive duration